/*==============================================================================
FILE NAME: Process_Notice_of_Violations.do
INPUTS: Notices_of_violation.dta
OUTPUTS: Notice_of_Violation_Clean.dta
UPDATED: 16 June 2022
==============================================================================*/

/* Standalone use: fill in your username and root path below so that the
   replication-package globals are defined when this do-file is run on its own */
if c(username) == "" { // insert username
    global rootdir "" // insert root path
    // Global path to the replication package's processed data folder
    global processed_data "${rootdir}/processed_data"
}

// Turn off the --more-- pause so output scrolls without interruption
set more off

// Read in the notices of violation dataset (replaces whatever is in memory)
use "${processed_data}/Notices_of_violation.dta", clear

// Restrict to the variables used below, then drop rows that are exact
// duplicates across all remaining variables
keep RegulatedEntityNo ViolationTrackNo InvestigationNo ///
     NOVDate InvestigationStartDate ///
     Media ViolationCategory
duplicates drop

// Shorten the identifier and date variable names in a single group rename
rename (NOVDate RegulatedEntityNo ViolationTrackNo InvestigationNo) ///
       (VN_Date RN VN IN)

// Diagnostic tallies: how often the NOV date equals (or differs from) the
// investigation start date, first over all records and then for AIR only
local same_day "VN_Date == InvestigationStartDate"
count if `same_day'
count if !(`same_day')
count if (`same_day') & Media == "AIR"
count if !(`same_day') & Media == "AIR"

// Every remaining row is a notice of violation, so flag them all
gen violation = 1

// Media-specific indicators: 1 for the matching medium, 0 for the other two
// listed media, and missing for any Media value outside AIR/WATER/WASTE
foreach medium in air water waste {
    local MEDIUM = upper("`medium'")
    gen violation_`medium' = (Media == "`MEDIUM'") if inlist(Media, "AIR", "WATER", "WASTE")
}

// Tag the media variable as coming from the NOV data
rename Media Media_NOV

// Split the NOV date into day, month, and year variables using the
// date function of the same name
foreach part in day month year {
    gen `part' = `part'(VN_Date)
}

// Write out the cleaned notices of violation dataset, overwriting any existing copy
save "${processed_data}/Notice_of_Violation_Clean.dta", replace
